library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)


options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

Load social distancing data and blockgroups

Load the Safegraph social distancing data and San Jose blockgroups

# get SJ blockgroups 
# get San Jose block groups
scc_blockgroups <- block_groups("CA","Santa Clara", cb=F, progress_bar=F)

# Use tracts sent to us by San Jose
sj_tracts <- st_read("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp") %>%
  st_as_sf() %>%
  st_transform(st_crs(scc_blockgroups))
## Reading layer `CSJ_Census_Tracts' from data source `/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp' using driver `ESRI Shapefile'
## Simple feature collection with 219 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
sj_citycouncil_disticts <- st_read("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/City Council Districts/CITY_COUNCIL_DISTRICTS.shp") %>%
  st_as_sf() %>%
  st_transform(st_crs(scc_blockgroups))
## Reading layer `CITY_COUNCIL_DISTRICTS' from data source `/Users/simonespeizer/pCloud Drive/Shared/SFBI/Data Library/San_Jose/City Council Districts/CITY_COUNCIL_DISTRICTS.shp' using driver `ESRI Shapefile'
## Simple feature collection with 10 features and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
# from code written by others to get SJ blockgroups
sj_blockgroups <-
  scc_blockgroups %>%
  st_centroid() %>%
  st_join(sj_tracts, left = F) %>%
  st_join(sj_citycouncil_disticts%>% dplyr::select(DISTRICTS)) %>%
  mutate(
    DISTRICTS = DISTRICTS %>% factor(levels = c("1","2","3","4","5","6","7","8","9","10"))
  ) %>%
  st_set_geometry(NULL) %>%
  left_join(scc_blockgroups%>% dplyr::select(GEOID), by = "GEOID") %>%
  st_as_sf() %>%
  dplyr::select(GEOID, DISTRICTS)

# the spatial join leaves off two blockgroups which are touching district 9. The following code assigns those to district 9
sj_blockgroups$DISTRICTS[is.na(sj_blockgroups$DISTRICTS)] <- 9

# code from others in the class to get social distancing data 
sj_socialdistancing <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/sj_socialdistancing.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date()) %>% 
  left_join(sj_blockgroups, by = c("origin_census_block_group" = "GEOID")) %>% 
  filter(!is.na(DISTRICTS))

# obtaining weekends
weekends <-
  sj_socialdistancing %>% 
  filter(!duplicated(date)) %>% 
  arrange(date) %>% 
  mutate(
    weekend = 
      ifelse(
        (date %>% as.numeric()) %% 7 %in% c(2,3),
        T,
        F
      )
  ) %>% 
  dplyr::select(date,weekend)

sj_socialdistancing <- 
  sj_socialdistancing %>% 
  left_join(weekends)

# date of the shelter in place order
shelter_start <- "2020-03-16" %>% as.Date()

# get average staying at home on weekdays in January and February
sj_pre_sd_at_home_average <- sj_socialdistancing %>% 
  filter(weekend == F) %>% 
  filter(date <  as.Date("2020-03-01")) %>%
  group_by(origin_census_block_group) %>% 
  summarize(completely_home_device_count = sum(completely_home_device_count), device_count = sum(device_count)) %>% 
  mutate(`% Completely at Home Pre Shelter` = (completely_home_device_count/device_count*100) %>% round(1))

Obtaining demographic variables

Here I obtain various demographic data, including income (percent below 50% and 80% of area median income), vehicle ownership, age, English language ability, and occupants per room.

# obtain the saved census data 
setwd("~/Documents/2020 Spring Quarter/CEE 218Z")
acs_vars = readRDS("censusData2018_acs_acs5.rds")
setwd("~/Documents/2020 Spring Quarter/CEE 218Z/covid19")
# load in income data - code adapted from other students
sj_median_income_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "B19013_001E"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  rename(
    Median_Income = B19013_001E 
  ) %>% 
  filter(!is.na(Median_Income)) %>% 
  left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
  filter(
    !is.na(DISTRICTS)
  ) %>% 
  
  # this code joins our census data with the social distancing data, processed as shown below
  left_join(sj_socialdistancing %>%  
                          filter(weekend == F) %>% 
                          filter(date > shelter_start) %>%
                          group_by(origin_census_block_group) %>% 
                          summarize(
                                    completely_home_device_count = sum(completely_home_device_count),
                                    device_count = sum(device_count)) %>% 
                          mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
            by = c("blockgroup" = "origin_census_block_group")
  ) %>% 
  filter(
    !is.na(device_count)
  ) %>% 
  left_join(sj_pre_sd_at_home_average %>% dplyr::select(origin_census_block_group, `% Completely at Home Pre Shelter`), by = c("blockgroup" = "origin_census_block_group"))

sj_ami_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B19001)"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
  group_by(blockgroup) %>% 
  summarize(
    Total = B19001_001E,
    `Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
    #sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
    `Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E)
  ) %>% 
  mutate(
    `% under 75,000` = `Under 75,000` / Total * 100,
    `% under 100,000` = `Under 100,000` / Total * 100
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
  ) %>% 
  filter(!is.na(device_count))
# loading in language data - code adapted from other students
sj_lang_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B16004)"
  )  %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(
    key = "variable",
    value = "estimate", 
    - blockgroup
  ) %>% 
  left_join(acs_vars, by = c("variable" = "name")) %>% 
  mutate(
    tier = substr(label,lapply(label, function(x) max(unlist(gregexpr('!!',x)))+2),nchar(label))
  ) %>% 
  filter(tier %in% c('Speak English "not well"', 
                     'Speak English "not at all"', 
                     'Total', 'Speak Spanish', 
                     'Speak Asian and Pacific Island languages')) %>% 
  group_by(blockgroup, tier) %>% 
  summarise(
    estimate1 = sum(estimate)
  ) %>% 
  spread(
    key = "tier",
    value = "estimate1"
  ) %>% 
  mutate(
    `% speaking english < well` = (`Speak English "not well"` + `Speak English "not at all"`) / Total * 100,
    `% speaking spanish` = (`Speak Spanish`/ Total) * 100,
    `% speaking api` = (`Speak Asian and Pacific Island languages` / Total) * 100
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>% 
  filter(!is.na(device_count)) %>% 
  mutate(log_perc = log(`% speaking english < well`))
# loading in age data - specifically looking at percentage 65+ and percentage <30
sj_age_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B01001)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(
    key = "variable",
    value = "estimate", 
    - blockgroup
  ) %>% 
  mutate(
    label = acs_vars$label[match(variable,acs_vars$name)]
  ) %>% 
  select(-variable) %>% 
  separate(
    label,
    into = c(NA,NA,"sex","age"),
    sep = "!!"
  ) %>% filter(!is.na(age)) %>% 
  mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>% 
  group_by(blockgroup) %>% 
  summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T)) %>% 
  mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>% 
  filter(!is.na(device_count)) 
# get data on vehicles available as vehicles allocation
sj_vehicles_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B992512)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  dplyr::select(B992512_001E, blockgroup) %>%
  rename(total_vehicles = B992512_001E, blockgroup = blockgroup) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  mutate(`vehicles per capita` = total_vehicles / total) %>%
  filter(!is.na(device_count)) 

# also get data on vehicles available as households without a vehicle
sj_no_vehicles_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B25044)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>%
  separate(label, into = c(NA, NA, NA,"vehicles"), sep = "!!") %>% 
  filter(!is.na(vehicles)) %>%
  group_by(blockgroup, vehicles) %>%
  summarize(grouped_vehicles = sum(estimate)) %>%
  spread(key = vehicles, value = grouped_vehicles) %>%
  mutate(total_nums = `1 vehicle available` + `2 vehicles available` + `3 vehicles available` + `4 vehicles available` + `5 or more vehicles available` + `No vehicle available`, `percent no vehicles` = `No vehicle available`*100 / total_nums, `log percent` = log(`percent no vehicles`)) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))
# get data on occupants per room
sj_occupants_per_room_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B25014)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>% 
  filter(!is.na(`occupants per room`)) %>%
  group_by(blockgroup, `occupants per room`) %>%
  summarize(estimate_tot = sum(estimate)) %>% 
  spread(key = `occupants per room`, value = estimate_tot) %>%
  mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`, `percent 1 or more` = (`1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) * 100/ total_nums) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count)) 

Testing correlations

In the plots below, I show the selected variables against percent of devices completely at home since the shelter-in-place order started, as well as against percent of devices pre-shelter-in-place for comparison.

Age:

# age
sj_age_by_block %>%
  ggplot(aes(
  x = `percent less than 30`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
labs(
    x = "Percent of residents younger than 30",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Young Age Groups"
  )

young_model <- lm(sj_age_by_block$`% Completely at Home` ~ sj_age_by_block$`percent less than 30`)
summary(young_model)
## 
## Call:
## lm(formula = sj_age_by_block$`% Completely at Home` ~ sj_age_by_block$`percent less than 30`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.272  -4.473   0.233   4.833  28.159 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            57.47212    1.52328  37.729  < 2e-16 ***
## sj_age_by_block$`percent less than 30` -0.20725    0.03861  -5.367 1.17e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.352 on 567 degrees of freedom
## Multiple R-squared:  0.04835,    Adjusted R-squared:  0.04667 
## F-statistic: 28.81 on 1 and 567 DF,  p-value: 1.167e-07
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
  ggplot(aes(
  x = `percent elderly`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of residents ages 65 and older",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Elderly Population"
  )

elderly_model <- lm(`% Completely at Home` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent elderly`, data = sj_age_by_block %>% 
##     filter(`percent elderly` < 50))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.632  -4.680   0.259   5.108  28.119 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       47.02191    0.79188   59.38  < 2e-16 ***
## `percent elderly`  0.19522    0.05468    3.57 0.000387 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.456 on 564 degrees of freedom
## Multiple R-squared:  0.0221, Adjusted R-squared:  0.02037 
## F-statistic: 12.75 on 1 and 564 DF,  p-value: 0.0003869
# compare this to pre-shelter-in-place behavior
sj_age_by_block %>%
  ggplot(aes(
  x = `percent less than 30`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
labs(
    x = "Percent of residents younger than 30",
    y = "Percent devices completely at home pre-shelter-in-place",
    title = "San Jose: Staying at Home and Young Age Groups Pre Shelter-in-Place"
  )

young_model2 <- lm(sj_age_by_block$`% Completely at Home Pre Shelter` ~ sj_age_by_block$`percent less than 30`)
summary(young_model2)
## 
## Call:
## lm(formula = sj_age_by_block$`% Completely at Home Pre Shelter` ~ 
##     sj_age_by_block$`percent less than 30`)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.9202  -2.9456   0.1802   2.4524  27.0472 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            17.49842    0.77360  22.620  < 2e-16 ***
## sj_age_by_block$`percent less than 30`  0.09199    0.01961   4.691 3.41e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.242 on 567 degrees of freedom
## Multiple R-squared:  0.03736,    Adjusted R-squared:  0.03566 
## F-statistic: 22.01 on 1 and 567 DF,  p-value: 3.41e-06
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
  ggplot(aes(
  x = `percent elderly`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of residents ages 65 and older",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying at Home and Elderly Population Pre Shelter-in-Place"
  )

elderly_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent elderly`, 
##     data = sj_age_by_block %>% filter(`percent elderly` < 50))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.2215  -3.0147   0.1708   2.5173  26.2336 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       22.55815    0.39776  56.713  < 2e-16 ***
## `percent elderly` -0.11900    0.02746  -4.333 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.247 on 564 degrees of freedom
## Multiple R-squared:  0.03222,    Adjusted R-squared:  0.0305 
## F-statistic: 18.77 on 1 and 564 DF,  p-value: 1.742e-05

Income:

# income - less than $75000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% under 75,000`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Households Below 50% AMI"
  )

income_75_model <- lm(`% Completely at Home` ~ `% under 75,000`, sj_ami_by_block)
summary(income_75_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% under 75,000`, data = sj_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.789  -4.168   0.541   4.649  20.690 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       57.9890     0.7199   80.55   <2e-16 ***
## `% under 75,000`  -0.2233     0.0172  -12.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.447 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2296, Adjusted R-squared:  0.2282 
## F-statistic: 168.7 on 1 and 566 DF,  p-value: < 2.2e-16
# income - less than $100000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% under 100,000`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $100,000 (80% AMI) annually",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Households Below 80% AMI"
  )

income_100_model <- lm(`% Completely at Home` ~ `% under 100,000`, sj_ami_by_block)
summary(income_100_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% under 100,000`, data = sj_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.026  -4.007   0.736   4.507  20.120 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       60.37827    0.83815   72.04   <2e-16 ***
## `% under 100,000` -0.22077    0.01592  -13.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.33 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2523 
## F-statistic: 192.3 on 1 and 566 DF,  p-value: < 2.2e-16
# compare to pre shelter in place
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% under 75,000`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying at Home and Households Below 50% AMI Pre Shelter-in-Place"
  )

income_75_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% under 75,000`, sj_ami_by_block)
summary(income_75_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% under 75,000`, 
##     data = sj_ami_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.2150  -2.8199  -0.0752   2.4328  27.3258 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      18.230228   0.397232  45.893  < 2e-16 ***
## `% under 75,000`  0.074344   0.009487   7.836 2.32e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.109 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.09787,    Adjusted R-squared:  0.09627 
## F-statistic:  61.4 on 1 and 566 DF,  p-value: 2.315e-14
# income - less than $100000
sj_ami_by_block %>% 
  ggplot(aes(
  x = `% under 100,000`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $100,000 (80% AMI) annually",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying Home and Households Below 80% AMI Pre Shelter-in-Place"
  )

income_100_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% under 100,000`, sj_ami_by_block)
summary(income_100_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% under 100,000`, 
##     data = sj_ami_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3737  -2.7043  -0.0472   2.4317  27.3698 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       17.253442   0.464232  37.166   <2e-16 ***
## `% under 100,000`  0.077203   0.008818   8.755   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.06 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1177 
## F-statistic: 76.66 on 1 and 566 DF,  p-value: < 2.2e-16

Language:

# language
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking english < well`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking English less than well",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and English Language Ability"
  )

english_ability_model <- lm(`% Completely at Home` ~ `% speaking english < well`, sj_lang_by_block)
summary(english_ability_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% speaking english < well`, 
##     data = sj_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.388  -3.914   0.397   4.999  24.526 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 52.04583    0.55037  94.566  < 2e-16 ***
## `% speaking english < well` -0.22415    0.03775  -5.938 5.02e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.307 on 567 degrees of freedom
## Multiple R-squared:  0.05855,    Adjusted R-squared:  0.05689 
## F-statistic: 35.26 on 1 and 567 DF,  p-value: 5.02e-09
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking spanish`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking Spanish",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Spanish Language Ability"
  )

spanish_speaking_model <- lm(`% Completely at Home` ~ `% speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% speaking spanish`, data = sj_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.548  -4.051   0.741   4.658  24.705 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          53.32684    0.49214  108.36   <2e-16 ***
## `% speaking spanish` -0.17128    0.01645  -10.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.845 on 567 degrees of freedom
## Multiple R-squared:  0.1605, Adjusted R-squared:  0.159 
## F-statistic: 108.4 on 1 and 567 DF,  p-value: < 2.2e-16
# compare to pre shelter in place
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking english < well`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking English less than well",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "San Jose: Staying at Home and English Language Ability Pre Shelter-in-Place"
  )

english_ability_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% speaking english < well`, sj_lang_by_block)
summary(english_ability_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% speaking english < well`, 
##     data = sj_lang_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3112  -2.8560  -0.0598   2.2760  27.7048 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 19.18999    0.26846  71.481   <2e-16 ***
## `% speaking english < well`  0.16300    0.01841   8.853   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.052 on 567 degrees of freedom
## Multiple R-squared:  0.1214, Adjusted R-squared:  0.1199 
## F-statistic: 78.37 on 1 and 567 DF,  p-value: < 2.2e-16
sj_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking spanish`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking Spanish",
    y = "Percent devices completely at home on weekdays pre shelter-in-place",
    title = "San Jose: Staying at Home and Spanish Language Ability Pre Shelter-in-Place"
  )

spanish_speaking_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% speaking spanish`, 
##     data = sj_lang_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.4706  -2.7537  -0.0822   2.4274  27.6257 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          19.384808   0.254815   76.07   <2e-16 ***
## `% speaking spanish`  0.073935   0.008518    8.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.062 on 567 degrees of freedom
## Multiple R-squared:  0.1173, Adjusted R-squared:  0.1157 
## F-statistic: 75.34 on 1 and 567 DF,  p-value: < 2.2e-16

Occupants per room:

# occupants per room
sj_occupants_per_room_by_block %>% 
  ggplot(aes(
  x = `percent 1 or more`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with more than 1 occupant per room",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Room Occupancy"
  )

occupants_model <- lm(`% Completely at Home` ~ `percent 1 or more`, sj_occupants_per_room_by_block)
summary(occupants_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent 1 or more`, data = sj_occupants_per_room_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.977  -4.137   0.286   4.849  23.842 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         51.85791    0.46833  110.73  < 2e-16 ***
## `percent 1 or more` -0.23399    0.03277   -7.14 2.89e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.126 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.08262,    Adjusted R-squared:  0.081 
## F-statistic: 50.97 on 1 and 566 DF,  p-value: 2.885e-12
# compare to pre shelter in place
sj_occupants_per_room_by_block %>% 
  ggplot(aes(
  x = `percent 1 or more`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with more than 1 occupant per room",
    y = "Percent devices completely at home on weekdays pre shelter-in-place",
    title = "San Jose: Staying at Home and Room Occupancy Pre Shelter-in-Place"
  )

occupants_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent 1 or more`, sj_occupants_per_room_by_block)
summary(occupants_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent 1 or more`, 
##     data = sj_occupants_per_room_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3938  -2.8717   0.0566   2.5198  27.1950 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         19.61616    0.23366  83.952   <2e-16 ***
## `percent 1 or more`  0.14478    0.01635   8.854   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.054 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1217, Adjusted R-squared:  0.1201 
## F-statistic:  78.4 on 1 and 566 DF,  p-value: < 2.2e-16

Vehicle ownership:

# vehicles
sj_vehicles_by_block %>% 
  ggplot(aes(
  x = `vehicles per capita`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Number of vehicles per capita",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Vehicles Per Capita"
  )

# vehicles - percent with no vehicles
sj_no_vehicles_by_block %>% 
  ggplot(aes(
  x = `percent no vehicles`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with no vehicle available",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Vehicle Availability"
  )

vehicles_model <- lm(`% Completely at Home` ~ `percent no vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent no vehicles`, 
##     data = sj_no_vehicles_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.404  -4.625   0.235   5.096  24.696 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           51.00370    0.43520 117.196  < 2e-16 ***
## `percent no vehicles` -0.29763    0.05439  -5.473 6.67e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.268 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.05025,    Adjusted R-squared:  0.04858 
## F-statistic: 29.95 on 1 and 566 DF,  p-value: 6.672e-08
# compare to pre shelter in place
sj_no_vehicles_by_block %>% 
  ggplot(aes(
  x = `percent no vehicles`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with no vehicle available",
    y = "Percent devices completely at home on weekdays pre shelter-in-place",
    title = "San Jose: Social Distancing and Vehicle Availability Pre Shelter-in-Place"
  )

vehicles_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent no vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent no vehicles`, 
##     data = sj_no_vehicles_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8094  -2.9241  -0.0556   2.7766  24.6695 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           20.36142    0.22283  91.378  < 2e-16 ***
## `percent no vehicles`  0.13931    0.02785   5.003 7.55e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.233 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.04235,    Adjusted R-squared:  0.04066 
## F-statistic: 25.03 on 1 and 566 DF,  p-value: 7.55e-07

Multiple regression analysis with income, age, language, and occupants per room

# multiple regression 
modeltest <- lm(sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 100,000` + sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english < well` + sj_occupants_per_room_by_block$`percent 1 or more`)
summary(modeltest)
## 
## Call:
## lm(formula = sj_ami_by_block$`% Completely at Home` ~ sj_ami_by_block$`% under 100,000` + 
##     sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english < well` + 
##     sj_occupants_per_room_by_block$`percent 1 or more`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.056  -4.073   0.593   4.495  18.560 
## 
## Coefficients:
##                                                     Estimate Std. Error t value
## (Intercept)                                        62.088401   1.629593  38.101
## sj_ami_by_block$`% under 100,000`                  -0.233430   0.020969 -11.132
## sj_age_by_block$`percent less than 30`             -0.050195   0.042165  -1.190
## sj_lang_by_block$`% speaking english < well`        0.068559   0.045802   1.497
## sj_occupants_per_room_by_block$`percent 1 or more`  0.006381   0.046449   0.137
##                                                    Pr(>|t|)    
## (Intercept)                                          <2e-16 ***
## sj_ami_by_block$`% under 100,000`                    <2e-16 ***
## sj_age_by_block$`percent less than 30`                0.234    
## sj_lang_by_block$`% speaking english < well`          0.135    
## sj_occupants_per_room_by_block$`percent 1 or more`    0.891    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.32 on 563 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2595, Adjusted R-squared:  0.2542 
## F-statistic: 49.31 on 4 and 563 DF,  p-value: < 2.2e-16

Experimentation

Experimentation with other variables and other ways of analyzing the social distancing data. First I look at a few other possible variables. I start with units in the structure.

# try getting other variables
# get data on units in structure
sj_units_in_structure_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B25024)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, "units"), sep = "!!") %>% 
  filter(!is.na(units)) %>%
  spread(key = units, value = estimate) %>%
  mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, `percent 20 or more` = (`20 to 49`+`50 or more`)* 100/ total_nums, `percent 1 only` = (`1, attached` + `1, detached`)*100/total_nums, `percent > 1` = 100 - `percent 1 only`) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))

# plot 
sj_units_in_structure_by_block %>% 
  ggplot(aes(
  x = `percent 20 or more`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of structures with more than 20 units",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and 20 or More Units Per Structure"
  )

summary(lm(`% Completely at Home` ~ `percent 20 or more`, sj_units_in_structure_by_block))
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent 20 or more`, data = sj_units_in_structure_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.335  -4.843   0.201   5.190  25.765 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          49.93469    0.40940 121.971   <2e-16 ***
## `percent 20 or more` -0.03712    0.02052  -1.809    0.071 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.459 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.005748,   Adjusted R-squared:  0.003992 
## F-statistic: 3.272 on 1 and 566 DF,  p-value: 0.07099
sj_units_in_structure_by_block %>% 
  ggplot(aes(
  x = `percent 1 only`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of structures with only one unit",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Only 1 Unit Per Structure"
  )

summary(lm(`% Completely at Home` ~ `percent 1 only`, sj_units_in_structure_by_block))
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent 1 only`, data = sj_units_in_structure_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.261  -4.405   0.224   5.078  24.608 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      44.75451    0.88920  50.331  < 2e-16 ***
## `percent 1 only`  0.06648    0.01132   5.872 7.33e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.237 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.05743,    Adjusted R-squared:  0.05576 
## F-statistic: 34.48 on 1 and 566 DF,  p-value: 7.328e-09

Household type and size:

# load data on household type and size
sj_house_size_type_by_block <- getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B11016)"
  ) %>% 
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>% 
  filter(!is.na(type))


# household type
sj_house_type_by_block <- sj_house_size_type_by_block %>% 
  filter(is.na(size)) %>% 
  dplyr::select(-size) %>%
  spread(key = type, value = estimate) %>% 
  mutate(`total households` = `Family households` + `Nonfamily households`, `percent nonfamily` = `Nonfamily households` / `total households`) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))

sj_house_type_by_block %>% 
  ggplot(aes(
  x = `percent nonfamily`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent nonfamily households",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Household Type"
  )

summary(lm(`% Completely at Home` ~ `percent nonfamily`, sj_house_type_by_block))
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent nonfamily`, data = sj_house_type_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.208  -4.597  -0.052   5.089  23.828 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          52.2308     0.6403  81.578  < 2e-16 ***
## `percent nonfamily` -11.0270     2.2223  -4.962 9.24e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.305 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.04169,    Adjusted R-squared:  0.03999 
## F-statistic: 24.62 on 1 and 566 DF,  p-value: 9.245e-07
# household size
sj_house_size_by_block <- sj_house_size_type_by_block %>% 
  filter(!is.na(size)) %>% 
  dplyr::select(-type) %>%
  group_by(blockgroup, size) %>%
  summarize(`total of this size` = sum(estimate)) %>% 
  spread(key = size, value = `total of this size`) %>%
  mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`, `percent 5 or more` = (`5-person household`+`6-person household` + `7-or-more person household`)* 100/ total_nums, `percent 1 or 2 only` = (`1-person household` + `2-person household`)*100/total_nums) %>%
  left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))

sj_house_size_by_block %>% 
  ggplot(aes(
  x = `percent 5 or more`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with 5 or more people",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Households With 5 or More"
  )

summary(lm(`% Completely at Home` ~ `percent 5 or more`, sj_house_size_by_block))
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent 5 or more`, data = sj_house_size_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.588  -4.419   0.605   4.958  25.419 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         51.32314    0.56617  90.650  < 2e-16 ***
## `percent 5 or more` -0.10054    0.02541  -3.957 8.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.369 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.02692,    Adjusted R-squared:  0.0252 
## F-statistic: 15.66 on 1 and 566 DF,  p-value: 8.545e-05
sj_house_size_by_block %>% 
  ggplot(aes(
  x = `percent 1 or 2 only`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with 1 or 2 people",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Small Household Size"
  )

summary(lm(`% Completely at Home` ~ `percent 1 or 2 only`, sj_house_size_by_block))
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent 1 or 2 only`, 
##     data = sj_house_size_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.673  -4.811   0.223   5.383  25.408 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           50.54506    0.98251  51.445   <2e-16 ***
## `percent 1 or 2 only` -0.02185    0.02043  -1.069    0.285    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.475 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.002016,   Adjusted R-squared:  0.0002529 
## F-statistic: 1.143 on 1 and 566 DF,  p-value: 0.2854

Next I consider different ways of looking at the social distancing data. First I try distance traveled.

# try other ways of looking at the social distancing data
# first look at total distance traveled
sj_sd_distance <- sj_socialdistancing %>% 
  filter(date > shelter_start) %>% 
  group_by(origin_census_block_group) %>% 
  summarize(total_dist_traveled = sum(distance_traveled_from_home), device_count = sum(device_count)) %>%
  mutate(total_dist_per_device = total_dist_traveled / device_count)

sj_distance_testing <- left_join(sj_ami_by_block, sj_sd_distance, by = c("blockgroup" = "origin_census_block_group")) %>% left_join(sj_age_by_block %>% select(blockgroup, `percent less than 30`))

sj_distance_testing %>% filter(total_dist_per_device < 500)  %>% 
  ggplot(aes(
  x = `% under 75,000`,
  y = total_dist_per_device
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
    y = "Average distance traveled per device during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Income, Distance Metric"
  )

This is very skewed by outliers, and probably not a useful metric.

Now I consider including devices that traveled <1km as staying at (or near) home.

sj_sd_range <- sj_socialdistancing %>% 
  filter(weekend == F) %>% 
  filter(date > shelter_start) %>%
  mutate(travel_buckets_split = lapply(bucketed_distance_traveled, function(x) strsplit(x, "<1000")[[1]][2]), less_than_1km = lapply(travel_buckets_split, function(x) strsplit(x, ":")[[1]][2]), less_than_1km = lapply(less_than_1km, function(x) strsplit(x, ",")[[1]][1])) %>%
  mutate(less_than_1km = lapply(less_than_1km, function(x) str_remove(x, "[}]")))  %>% # clean a bit more
  mutate(less_than_1km = as.numeric(less_than_1km), less_than_1km = replace_na(less_than_1km, 0)) %>% 
  mutate(home_or_1km = completely_home_device_count + less_than_1km) %>% 
  group_by(origin_census_block_group) %>% 
  summarize(home_or_1km = sum(home_or_1km), device_count = sum(device_count)) %>% 
  mutate(`% Within 1km of Home` = (home_or_1km/device_count*100) %>% round(1))

# join this with other data
sj_1km_testing <- left_join(sj_ami_by_block, sj_sd_range, by = c("blockgroup" = "origin_census_block_group")) %>% 
  left_join(sj_occupants_per_room_by_block %>% dplyr::select(`percent 1 or more`, blockgroup)) %>%
  left_join(sj_age_by_block %>% dplyr::select(`percent less than 30`, blockgroup)) %>%
  left_join(sj_lang_by_block %>% dplyr::select(`% speaking english < well`, blockgroup)) 

# plot with income
sj_1km_testing %>%  
  ggplot(aes(
  x = `% under 75,000`,
  y = `% Within 1km of Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
    y = "% of devices within 1km of home, weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Income, 1km Range"
  )

summary(lm(`% Within 1km of Home` ~ `% under 75,000`, sj_1km_testing))
## 
## Call:
## lm(formula = `% Within 1km of Home` ~ `% under 75,000`, data = sj_1km_testing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.458  -4.266   0.401   4.993  24.159 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       67.1812     0.7578   88.65   <2e-16 ***
## `% under 75,000`  -0.2343     0.0181  -12.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.839 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2285, Adjusted R-squared:  0.2271 
## F-statistic: 167.6 on 1 and 566 DF,  p-value: < 2.2e-16
# plot with age
sj_1km_testing %>%  
  ggplot(aes(
  x = `percent less than 30`,
  y = `% Within 1km of Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of people younger than 30",
    y = "Percent of devices within 1km of home during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Age, 1km Range"
  )

summary(lm(`% Within 1km of Home` ~ `percent less than 30`, sj_1km_testing))
## 
## Call:
## lm(formula = `% Within 1km of Home` ~ `percent less than 30`, 
##     data = sj_1km_testing)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.271  -4.766   0.304   5.309  25.413 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            68.78200    1.57767  43.597  < 2e-16 ***
## `percent less than 30` -0.27326    0.03999  -6.833 2.15e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.651 on 567 degrees of freedom
## Multiple R-squared:  0.07608,    Adjusted R-squared:  0.07445 
## F-statistic: 46.69 on 1 and 567 DF,  p-value: 2.154e-11
# run multiple regression model
modeltest2 <- lm(sj_1km_testing$`% Within 1km of Home` ~ sj_1km_testing$`% under 75,000` + sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english < well` + sj_1km_testing$`percent 1 or more`)
summary(modeltest2)
## 
## Call:
## lm(formula = sj_1km_testing$`% Within 1km of Home` ~ sj_1km_testing$`% under 75,000` + 
##     sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english < well` + 
##     sj_1km_testing$`percent 1 or more`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.972  -4.599   0.793   4.765  23.400 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                71.77102    1.67952  42.733  < 2e-16
## sj_1km_testing$`% under 75,000`            -0.20967    0.02267  -9.249  < 2e-16
## sj_1km_testing$`percent less than 30`      -0.14057    0.04466  -3.148  0.00173
## sj_1km_testing$`% speaking english < well` -0.02227    0.04822  -0.462  0.64432
## sj_1km_testing$`percent 1 or more`          0.01303    0.04897   0.266  0.79024
##                                               
## (Intercept)                                ***
## sj_1km_testing$`% under 75,000`            ***
## sj_1km_testing$`percent less than 30`      ** 
## sj_1km_testing$`% speaking english < well`    
## sj_1km_testing$`percent 1 or more`            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.77 on 563 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.246,  Adjusted R-squared:  0.2406 
## F-statistic: 45.92 on 4 and 563 DF,  p-value: < 2.2e-16

It looks like the fit of these selected variables is slightly better for the social distancing data based on not traveling farther than 1km.

Now I also consider “non-work” behavior.

sj_nonworking_by_block <- sj_socialdistancing %>% 
  filter(weekend == F) %>% 
  filter(date > shelter_start) %>%
  mutate(nonworking = device_count - completely_home_device_count - part_time_work_behavior_devices - full_time_work_behavior_devices) %>%
  group_by(origin_census_block_group) %>%
  summarize(nonworking_count = sum(nonworking), total_device = sum(device_count)) %>% 
  mutate(nonworking_percent = nonworking_count*100 / total_device, percent_only_work_home = 100-nonworking_percent) %>%
  left_join(sj_1km_testing %>% dplyr::select(`% under 75,000`, `percent less than 30`, `% speaking english < well`, `percent 1 or more`, blockgroup), by = c("origin_census_block_group" = "blockgroup"))


# plot against age and income
sj_nonworking_by_block %>%  
  ggplot(aes(
  x = `% under 75,000`,
  y = percent_only_work_home
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with incomes under $75,000 (50% AMI) annually",
    y = "Percent of devices completely at home or leaving home for non-work purposes during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Income, Nonworking Behavior"
  )

summary(lm(percent_only_work_home ~ `% under 75,000`, sj_nonworking_by_block))
## 
## Call:
## lm(formula = percent_only_work_home ~ `% under 75,000`, data = sj_nonworking_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2184  -3.1028   0.1581   3.1343  22.1765 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      71.53383    0.49714  143.89   <2e-16 ***
## `% under 75,000` -0.12775    0.01187  -10.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.142 on 566 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1698, Adjusted R-squared:  0.1683 
## F-statistic: 115.8 on 1 and 566 DF,  p-value: < 2.2e-16
sj_nonworking_by_block %>%  
  ggplot(aes(
  x = `percent less than 30`,
  y = percent_only_work_home
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of people younger than 30",
    y = "Percent of devices completely at home or leaving home for non-work purposes during weekdays since shelter-in-place",
    title = "San Jose: Social Distancing and Age, Nonworking Behavior"
  )

summary(lm(percent_only_work_home ~ `percent less than 30`, sj_nonworking_by_block))
## 
## Call:
## lm(formula = percent_only_work_home ~ `percent less than 30`, 
##     data = sj_nonworking_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.7223  -3.4844   0.2204   3.5775  22.6311 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            68.83712    1.02739  67.002   <2e-16 ***
## `percent less than 30` -0.05479    0.02604  -2.104   0.0358 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.633 on 567 degrees of freedom
## Multiple R-squared:  0.007747,   Adjusted R-squared:  0.005997 
## F-statistic: 4.427 on 1 and 567 DF,  p-value: 0.03582
# multiple regression model
modeltest3 <- lm(sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% under 75,000` + sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english < well` + sj_nonworking_by_block$`percent 1 or more`)
summary(modeltest3)
## 
## Call:
## lm(formula = sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% under 75,000` + 
##     sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english < well` + 
##     sj_nonworking_by_block$`percent 1 or more`)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.9975  -3.2297   0.0295   3.1285  16.9204 
## 
## Coefficients:
##                                                    Estimate Std. Error t value
## (Intercept)                                        30.89719    1.10085  28.067
## sj_nonworking_by_block$`% under 75,000`             0.10453    0.01486   7.035
## sj_nonworking_by_block$`percent less than 30`      -0.07166    0.02927  -2.448
## sj_nonworking_by_block$`% speaking english < well`  0.04189    0.03161   1.326
## sj_nonworking_by_block$`percent 1 or more`          0.07375    0.03210   2.298
##                                                    Pr(>|t|)    
## (Intercept)                                         < 2e-16 ***
## sj_nonworking_by_block$`% under 75,000`            5.82e-12 ***
## sj_nonworking_by_block$`percent less than 30`        0.0147 *  
## sj_nonworking_by_block$`% speaking english < well`   0.1855    
## sj_nonworking_by_block$`percent 1 or more`           0.0219 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.093 on 563 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:   0.19,  Adjusted R-squared:  0.1842 
## F-statistic: 33.01 on 4 and 563 DF,  p-value: < 2.2e-16

These variables do worse for the percent nonworking metric, which makes sense.